date: 2022-05-16

Synopsis

This report is part of the evaluation of Johns Hopkins University - Data Science Capstone.

The goal of this project is to display the work with the data and preparation to create the prediction algorithm.

Load Data

# Download and unpack the SwiftKey corpus once; skipped when the extracted
# "final" directory (or the zip itself) is already present.
if (!dir.exists("final")){
     
     if(!file.exists("Coursera-Swiftkey.zip")){
          # mode = "wb" is required so the binary zip archive is not
          # corrupted by text-mode line-ending translation on Windows.
          download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                        destfile = "Coursera-Swiftkey.zip",
                        mode = "wb")
     }
     
     unzip("Coursera-Swiftkey.zip")
}

# Reads one corpus file into a character vector (one element per line).
# skipNul = TRUE is needed because these files contain embedded NUL bytes.
# on.exit() guarantees the connection is closed even if readLines() errors.
read_corpus <- function(path) {
     con <- file(path, open = "r")
     on.exit(close(con), add = TRUE)
     readLines(con, skipNul = TRUE)
}

blogs   <- read_corpus("final/en_US/en_US.blogs.txt")
news    <- read_corpus("final/en_US/en_US.news.txt")
twitter <- read_corpus("final/en_US/en_US.twitter.txt")

Basic Summary

The three data sets are quite large and this report will focus on the Tokenizing steps of Natural Language Processing (NLP).

In this section, the data sets will be analyzed and summarized separately.

Below we can see a summary of each data set: file size, character, word, and line counts, plus Words Per Line (WPL) statistics:

##      file     size characters    words   lines WPL.Min. WPL.1st.Qu. WPL.Median
## 1   Blogs 200.4 MB  208361438 38601176  899288        1           9         29
## 2    News 196.3 MB   15683765  2755797   77259        1          20         33
## 3 Twitter 159.4 MB  162385035 31130623 2360148        1           7         12
##   WPL.Mean WPL.3rd.Qu. WPL.Max.
## 1 42.92415          61     6851
## 2 35.66959          47     1522
## 3 13.19011          19       62

The code is available at the Appendix 1 section.

Exploratory Analysis

For better understanding, the data sets were converted into character vectors and then counted the frequency of appearance of every word.

The three character vectors were merged into a single vector using the full_join() function. In the end, a data frame was created with the words and their respective sums of frequencies. The top 5 most frequent words were:

##     UW  rowsum
## 1    i 1374974
## 2  the  297714
## 3 just  253200
## 4 like  222915
## 5 will  215039

And here is a histogram of the 20 most frequent words:

For a better visualization, below there is a Wordcloud from the wordcloud2 library, which plots each word with a size directly proportional to its frequency.

All the codes used to plot these data are available at Appendix 2 section.

Appendix

1. Basic Summary

# File sizes in MB. NOTE: paths must include the "final/" prefix to match
# where the data was actually read from above; without it file.info()
# returns NA sizes.
file_MB <- round(file.info(c("final/en_US/en_US.blogs.txt",
                             "final/en_US/en_US.news.txt",
                             "final/en_US/en_US.twitter.txt"))$size / 1024^2,
                 digits = 1)

# Calculates the total number of characters in each data set
numChars <- sapply(list(nchar(blogs), 
                        nchar(news), 
                        nchar(twitter)),
                   FUN = sum)

# Calculates the total number of lines in each data set
numLines <- sapply(list(blogs, 
                        news,
                        twitter),
                   FUN = length)

# Words Per Line (WPL): a list with one integer vector per data set,
# counting "\\w+" word tokens on every line (requires stringr).
WPL <- sapply(list(blogs, news, twitter),
                   FUN = function(x){
                        str_count(string = x,
                                  pattern = "\\w+")
                   })

# Calculates the total number of words in each data set
numWords <- sapply(X = WPL,
                   FUN = sum)

# Six-number summary (Min/quartiles/Mean/Max) of WPL for each data set,
# transposed so each data set is one row.
sumWPL <- sapply(X = WPL,
                 FUN = summary) %>% t()

# One row per corpus; the WPL matrix expands into WPL.* columns.
mydf <- data.frame(file = c("Blogs", "News", "Twitter"),
                   size = paste(file_MB, "MB"), 
                   characters = numChars,
                   words = numWords,
                   lines = numLines,
                   WPL = sumWPL)
print(mydf)

2. Exploratory Analysis

# This function takes a string vector of words, clear out all stopwords and 
# returns a data frame of every word and its respective frequency.
table_words <- function(x) {
     require(dplyr); require(stringr); require(tm)
     
     UW <- str_remove_all(string = x,
                          pattern = '[[:punct:]]') %>%
          removeWords(words = stopwords('en')) %>%
          tolower() %>%
          str_split(pattern = " ") %>%
          unlist()
     
     UW <- UW[(grepl(pattern = "^[a-z]", UW))&(!grepl(pattern = "[^\x01-\x7F]+", UW))] # "[^\x01-\x7F]+" removes non-english characters 
     
     return(as.data.frame(sort(table(UW), decreasing = T)))
}

# Per-corpus word-frequency tables (columns UW and Freq).
blogs_df <- table_words(blogs)
news_df <- table_words(news)
twitter_df <- table_words(twitter)

# Merge the top 150 words of each corpus by word (UW). After the two joins
# the frequency columns are suffixed Freq.x (blogs), Freq.y (news), and
# Freq (twitter); words missing from a corpus get NA there, hence
# na.rm = TRUE. rowwise() makes sum() operate per word rather than over
# whole columns.
joined_df <- full_join(x = blogs_df[1:150,],
                       y = news_df[1:150,],
                       by = "UW") %>%
     full_join(y = twitter_df[1:150,],
               by = "UW") %>% rowwise() %>%
     mutate(rowsum = sum(Freq.x, Freq.y, Freq,
                         na.rm = T)) %>%
     select(UW, rowsum) %>% arrange(desc(rowsum)) %>%
     as.data.frame()

print(head(joined_df, 5))

library(ggplot2)
library(scales)
library(plotly)

# Bar chart of the 20 most frequent words. geom_col() is the correct geom
# for pre-computed heights; geom_histogram(stat = "identity") is a
# deprecated misuse that triggers a warning in current ggplot2.
g <- ggplot(head(joined_df, 20),
            aes(x = UW,
                y = rowsum,
                fill = UW)) +
     geom_col() +
     theme(axis.text.x = element_text(angle = 90, 
                                      vjust = 0.5, 
                                      hjust=1),
           legend.position = "none",
           axis.title.x = element_blank()) +
     ylab("Frequency") +
     scale_y_continuous(labels = comma)

# Render as an interactive plotly widget.
print(ggplotly(g))

library(wordcloud2)

# Word cloud of the 150 most frequent words; word size scales with frequency.
top_words <- head(joined_df, 150)
print(wordcloud2(top_words, size = 3))